{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CS189/289A HW3 code part2\n",
    "### Yicheng Chen\n",
    "### 02/15/2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import scipy.io\n",
    "import random\n",
    "from random import shuffle\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from skimage.feature import hog\n",
    "import pandas as pd\n",
    "import cv2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## (a) fit Gaussian distribution to each digit class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mnist = scipy.io.loadmat('mnist/train.mat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "training_X = mnist['trainX'][:, :-1]\n",
    "training_y = mnist['trainX'][:, -1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def normalize_row(X):\n",
    "    Xn = np.zeros(X.shape)\n",
    "    for i in range(X.shape[0]):\n",
    "        x = X[i, :]\n",
    "        Xn[i, :] = (x/(np.sqrt(x.dot(x))+1e-15))\n",
    "    return Xn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "training_Xn = normalize_row(training_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for digit in range(1, 2):\n",
    "    training_digit = training_Xn[training_y==digit, :]\n",
    "    mean = training_digit.mean(axis=0)\n",
    "    cov = np.cov(training_digit.T)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## (b) Visualize the covariance matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.imshow(cov)\n",
    "plt.colorbar()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## (c) Classify the digits in the test set on the basis of posterior probabilities with two different approaches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (i) LDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class LDA:\n",
    "    def __init__(self):\n",
    "        self.Mean = 0\n",
    "        self.Cov = 0\n",
    "        self.P = 0\n",
    "    \n",
    "    def fit(self, X, y):\n",
    "        N = len(y)\n",
    "        labels = set(y)\n",
    "        Cov = 0\n",
    "        Mean = np.zeros([len(labels), X.shape[1]])\n",
    "        P = np.zeros([len(labels)])\n",
    "        for i, label in enumerate(labels):\n",
    "            X_class = X[y==label, :]\n",
    "            Nc = X_class.shape[0]\n",
    "            P[i] = Nc / N\n",
    "            Mean[i, :] = X_class.mean(axis=0)\n",
    "            Cov += np.cov(X_class.T) * Nc\n",
    "        Cov = Cov / N\n",
    "        self.Mean = Mean\n",
    "        self.Cov = Cov\n",
    "        self.P = P\n",
    "    \n",
    "    def predict(self, X):\n",
    "        PreMat = np.linalg.pinv(self.Cov)\n",
    "        L = self.Mean.dot(PreMat).dot(X.T).T \\\n",
    "            - 1/2*np.diag(self.Mean.dot(PreMat).dot(self.Mean.T)) \\\n",
    "            - np.log(self.P)\n",
    "        yp = np.argmax(L.T, axis=0)\n",
    "        return yp\n",
    "    \n",
    "    def accuracy(self, X, y):\n",
    "        yp = self.predict(X)\n",
    "        N = len(y)\n",
    "        Nerror = (yp != y).sum()\n",
    "        return 1 - Nerror/N\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mnist = scipy.io.loadmat('mnist/train.mat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "training = mnist['trainX']\n",
    "shuffle(training)\n",
    "validation = training[0:10000]\n",
    "training = training[10000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "training_X = normalize_row(training[:, :-1])\n",
    "training_y = training[:, -1]\n",
    "validation_X = normalize_row(validation[:, :-1])\n",
    "validation_y = validation[:, -1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subset_size = np.array([100, 200, 500, 1000, 2000, 5000, 10000, 30000, 50000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_accuracy = np.zeros(len(subset_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i, N in enumerate(subset_size):\n",
    "    idx = random.sample(range(50000), N)\n",
    "    lda = LDA()\n",
    "    lda.fit(training_X[idx, :], training_y[idx])\n",
    "    all_accuracy[i] = lda.accuracy(validation_X, validation_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(subset_size, all_accuracy * 100)\n",
    "plt.xscale('log')\n",
    "plt.xlabel('Training set size')\n",
    "plt.ylabel('Accuracy(%)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (ii) QDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class QDA:\n",
    "    def __init__(self, a):\n",
    "        self.Mean = 0\n",
    "        self.Cov = 0\n",
    "        self.P = 0\n",
    "        self.a = a\n",
    "        \n",
    "    def Q(self, X, m, C, p):\n",
    "        PreMat = np.linalg.inv(C)\n",
    "        Const = - 1/2*np.log(np.linalg.det(C)+1e-20) + np.log(p)\n",
    "        Qc = np.array([(-1/2*(x-m).dot(PreMat).dot(x-m)) + Const for x in X])\n",
    "        return Qc\n",
    "    \n",
    "    def fit(self, X, y):\n",
    "        N = len(y)\n",
    "        d = X.shape[1]\n",
    "        labels = set(y)\n",
    "        Cov = np.zeros([X.shape[1], d, len(labels)])\n",
    "        Mean = np.zeros([len(labels), d])\n",
    "        P = np.zeros([len(labels)])\n",
    "        for i, label in enumerate(labels):\n",
    "            X_class = X[y==label, :]\n",
    "            Nc = X_class.shape[0]\n",
    "            P[i] = Nc / N\n",
    "            Mean[i, :] = X_class.mean(axis=0)\n",
    "            Cov[:, :, i] = np.cov(X_class.T) + self.a*np.eye(d)\n",
    "        self.Mean = Mean\n",
    "        self.Cov = Cov\n",
    "        self.P = P\n",
    "    \n",
    "    def predict(self, X):\n",
    "        nC = len(self.P)\n",
    "        L = np.zeros([nC, len(X)])\n",
    "        for i in range(nC):\n",
    "            L[i, :] = self.Q(X, self.Mean[i], self.Cov[:, :, i], self.P[i])\n",
    "        yp = np.argmax(L, axis=0)\n",
    "        return yp\n",
    "    \n",
    "    def accuracy(self, X, y):\n",
    "        yp = self.predict(X)\n",
    "        N = len(y)\n",
    "        Nerror = (yp != y).sum()\n",
    "        return 1-Nerror/N\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "subset_size = np.array([100, 200, 500, 1000, 2000, 5000, 10000, 30000, 50000])\n",
    "all_accuracy = np.zeros(len(subset_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i, N in enumerate(subset_size):\n",
    "    idx = random.sample(range(50000), N)\n",
    "    qda = QDA(1e-8)\n",
    "    qda.fit(training_X[idx, :], training_y[idx])\n",
    "    all_accuracy[i] = qda.accuracy(validation_X, validation_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(subset_size, all_accuracy * 100)\n",
    "plt.xscale('log')\n",
    "plt.xlabel('Training set size')\n",
    "plt.ylabel('Accuracy(%)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (iii) which perform better?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(Written answer.) QDA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (iv) Kaggle "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Calculate the HOG feature\n",
    "def hog_feature(data):\n",
    "    data_hog = np.zeros(data.shape)\n",
    "    for i in range(0, len(data)):\n",
    "        feature = data[i, :]\n",
    "        image = feature.reshape(28, 28)\n",
    "        fd, hog_image = hog(image, orientations=8, pixels_per_cell=(4, 4), cells_per_block=(1, 1), visualise=True)\n",
    "        data_hog[i, :] = hog_image.ravel()\n",
    "    return data_hog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mnist = scipy.io.loadmat('mnist/train.mat')\n",
    "test = scipy.io.loadmat('mnist/test.mat')\n",
    "training_X = mnist['trainX'][:, :-1]\n",
    "training_y = mnist['trainX'][:, -1]\n",
    "test_X = test['testX']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "training_Xh = normalize_row(hog_feature(training_X))\n",
    "test_Xh = normalize_row(hog_feature(test_X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "training_Xcombined = np.concatenate((normalize_row(training_X), training_Xh), axis=1)\n",
    "test_Xcombined = np.concatenate((normalize_row(test_X), test_Xh), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "std = np.std(training_Xcombined, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# use the max N variance word to build model\n",
    "# max_idx = std.argsort()[:]\n",
    "# training_X_max = training_Xcombined[:, max_idx]\n",
    "# test_X_max = test_Xcombined[:, max_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.96156666666666668"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda = LDA()\n",
    "lda.fit(training_Xcombined, training_y)\n",
    "\n",
    "yp_lda = lda.predict(test_Xcombined)\n",
    "lda.accuracy(training_Xcombined, training_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame({'Category': yp_lda})\n",
    "df.index.rename('Id', inplace=True)\n",
    "df.to_csv('mnist/mnist_predict_org+hog_lda.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## (d) Spam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "spam = scipy.io.loadmat('spam/spam_data.mat')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag-of-Words feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "SPAM_DIR = 'spam/'\n",
    "HAM_DIR = 'ham/'\n",
    "TEST_DIR = 'test/'\n",
    "NUM_TEST_EXAMPLES = 10000\n",
    "spam_filenames = glob.glob('spam/' + SPAM_DIR + '*.txt')\n",
    "ham_filenames = glob.glob('spam/' + HAM_DIR + '*.txt')\n",
    "test_filenames = ['spam/' + TEST_DIR + str(x) + '.txt' for x in range(NUM_TEST_EXAMPLES)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_text = []\n",
    "for file in spam_filenames+ham_filenames: # use only training set data to build BOG\n",
    "    with open(file, \"r\", encoding='utf-8', errors='ignore') as f:\n",
    "        all_text.append(f.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "all_test_text = []\n",
    "for file in test_filenames: # use only training set data to build BOG\n",
    "    with open(file, \"r\", encoding='utf-8', errors='ignore') as f:\n",
    "        all_test_text.append(f.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(min_df=4) # min word length=4\n",
    "training_X = normalize_row(vectorizer.fit_transform(all_text).toarray())\n",
    "test_X = normalize_row(vectorizer.transform(all_test_text).toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "training_y = np.concatenate((np.ones(len(spam_filenames)), np.zeros(len(ham_filenames))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "std = np.std(training_X, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# use the max 2000 variance word to build model\n",
    "max_idx = std.argsort()[-2000:]\n",
    "training_X_max = training_X[:, max_idx]\n",
    "test_X_max = test_X[:, max_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.98886169943464686"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda = LDA()\n",
    "lda.fit(training_X_max, training_y)\n",
    "lda.accuracy(training_X_max, training_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Kaggle submission\n",
    "yp = lda.predict(test_X_max)\n",
    "df = pd.DataFrame({'Category': yp})\n",
    "df.index.rename('Id', inplace=True)\n",
    "df.to_csv('spam/spam_BOW_predict.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## (e) For Experts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# max 10 variance words\n",
    "max_idx = std.argsort()[-10:]\n",
    "training_X_max = training_X[:, max_idx]\n",
    "test_X_max = test_X[:, max_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7610328242342419"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda = LDA()\n",
    "lda.fit(training_X_max, training_y)\n",
    "lda.accuracy(training_X_max, training_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# min 10 variance words\n",
    "min_idx = std.argsort()[:10]\n",
    "training_X_min = training_X[:, min_idx]\n",
    "test_X_min = test_X[:, min_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.49308075267909879"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lda = LDA()\n",
    "lda.fit(training_X_min, training_y)\n",
    "lda.accuracy(training_X_min, training_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
